동적 풀기

동적인 크기를 가지는 컨테이너의 가속화는 고정 크기 최적화만큼 중요하며, 훨씬 더 많이 필요하다.

벡터 예제

연산) u=3v+w;

// 벤치 마킹 코드

#include <iostream>

#include <chrono>

#include <vector>

using namespace std::chrono;

template <typename TP>

double duration_ms(const TP& from, const TP& to, unsigned rep){

return duration_cast<nanoseconds>((to-from)/rep).count()/1000.;

}

int main(int argc, char* argv[]){

unsigned s=1000;

if(argc>1)s=atoi(argv[1]);

std::vector<float> u(s), v(s), w(s);

for(unsigned i=0; i<s; ++i){

v[i]=float(i);

w[i]=float(2*i+15);

}

// L1 or L2 cache load

for(unsigned j=0; j<3; j++)

for(unsigned i=0; i<s; ++i)

u[i]=3.0f*v[i]+w[i];

const unsigned rep=200000;

using TP=time_point<steady_clock>;

TP unr=steady_clock::now();

for(unsigned j=0; j<rep; j++)

for(unsigned i=0; i<s; ++i)

u[i]=3.0f*v[i]+w[i];

TP unr_end=steady_clock::now();

std::cout<<"Compute time native loop is "<<duration_ms(unr, unr_end, rep)<<" µs.\n";

return 0;

}

중간에 for 연산을 통해서 벡터 u, v, w를 레지스터에 저장하도록 유도한다.

연산에서 간단한 반복을 각 반복에서 4번의 연산을 수행하는 반복문으로 변경해서 속도를 비교할 수 있다.

// 벤치 마킹 코드

#include <iostream>

#include <chrono>

#include <vector>

using namespace std::chrono;

template <typename TP>

double duration_ms(const TP& from, const TP& to, unsigned rep){

return duration_cast<nanoseconds>((to-from)/rep).count()/1000.;

}

int main(int argc, char* argv[]){

unsigned s=1000;

if(argc>1)s=atoi(argv[1]);

std::vector<float> u(s), v(s), w(s);

for(unsigned i=0; i<s; ++i){

v[i]=float(i);

w[i]=float(2*i+15);

}

// L1 or L2 cache load

for(unsigned j=0; j<3; j++)

for(unsigned i=0; i<s; ++i)

u[i]=3.0f*v[i]+w[i];

const unsigned rep=200000;

using TP=time_point<steady_clock>;

TP unr=steady_clock::now();

for(unsigned j=0; j<rep; j++){

unsigned sb=s/4+4;

for(unsigned i=0; i<sb; i+=4){

u[i]=3.0f*v[i]+w[i];

u[i+1]=3.0f*v[i+1]+w[i+1];

u[i+2]=3.0f*v[i+2]+w[i+2];

u[i+3]=3.0f*v[i+3]+w[i+3];

}

for(unsigned i=sb; i<s; ++i)

u[i]=3.0f*v[i]+w[i];

}

TP unr_end=steady_clock::now();

std::cout<<"Compute time unrolled loop is "<<duration_ms(unr, unr_end, rep)<<" µs.\n";

return 0;

}

compile option

-ffast-math

allows optimizations that do not preserve strict IEEE compiliance

x=x*x*x*x*x*x*x*x; 를

x*=x;

x*=x; 로 변환

-funroll-loops

instructs the compiler to perform basic loop unrolling

결과(g++ -std=c++17 -ffast-math -DNDEBUG -o csian csian.cpp)

Compute time native loop is 3.485 µs.

Compute time unrolled loop is 3.486 µs.

결과(g++ -std=c++17 -funroll-loops -o csian csian.cpp)

Compute time native loop is 3.48 µs.

Compute time unroled loop is 3.472 µs.

최신 컴파일러에서는 거의 큰 차이를 느낄 수가 없음

컴퓨터는 프로세서마다 서로 다른 수의 레지스터를 가지고 있기 때문에

적당한 반복문을 풀어줘야 하는 크기는 다르다.(레지스터에 담을 수 있는 크기 선에서 결정)

또한 표현식의 타입에 따라서 필요한 레지스터의 수가 다르다.

벡터 표현식 풀기

my_axpy는 블록 크기를 받는 템플릿 인수를 가지는 템플릿 함수

block 크기만큼 반복문을 풀어씀

template <unsigned BSize, typename U, typename V, typename W>

void my_axpy(U& u, const V& v, const W& w){

assert(u.size()==v.size() && v.size()==w.size());

unsigned s=u.size(), sb=s/BSize*BSize;

for(unsigned i=0; i<sub; i+=BSize) my_axpy_ftor<0, BSize>()(u, v, w, i);

for(unsigned i=sb; i<s; ++i) u[i]=3.0f*v[i]+w[i];

}

template <unsigned Offset, unsigned Max>

struct my_axpr_ftor{

template <typename U, typename V, typename W>

void operator()(U& u, const V& v, const W& w, unsigned i){

u[i+Offset]=3.0f*v[i+Offset]+w[i+Offset];

my_axpy_ftor<Offset+1, Max>()(u, v, w, i);

}

};

template <unsigned Max>

struct my_axpr_ftor<Max, Max>{

template <typename U, typename V, typename W>

void operator()(U& u, const V& v, const W& w, unsigned i) {}

};

표현식 템플릿 튜닝

동일하게 벡터 덧셈을 위한 표현식 템플릿에서도 반복문을 operator=을 이용해서 분할할 수 있다.

u=v+v+w

template <typename T>

class vector{

template <typename Src>

vector& operator=(const Src& that){

check_size(size(that));

unsigned s=my_size, sb=s/4*4;

for(unsigned i=0; i<sb; i+=4) assign<0, 4>()(*this, that, i);

for(unsigned i=sb; i<s; ++i) data[i]=that[i];

return *this;

}

};

template <typename Offset, unsigned Max>

struct assgin{

template <typename U, typename V>

void operator()(U& u, const V& v, unsigned i){

u[u+Offset]=v[i+offSet];

assign<Offset+1, Max>()(u, v, i);

}

};

template <unsigned Max>

struct assign<Max, Max>{

template <typename U, typename V>

void operator()(U& u, const V& v, unsigned i)

{}

};

명시적으로 루프푸는 횟수를 지정(위의 코드는 무조건 4회씩 풀어서 연산)

template <unsigned BSize, typename Src>

vector& operator=(const Src& that){

check_size(size(that));

unsigned s=my_size, sb=s/BSize*BSize;

for(unsigned i=0; i<sb; i+=BSize) assign<0, BSize>()(*this, that, i);

for(unsigned i=sb; i<s; ++i) data[i]=that[i];

return *this;

}

하지만, 위와 같이 정의할 시, 더 이상 자연 이항 연산자(operator=) 기호를 사용할 수 없다.

u.operator=<4>(v+v+w); // BSize에 대한 템플릿 인자 명시 필요

위와 같은 표기법 대신, 아래와 같이 표현하기 위해 unroll_vector를 재 정의할 수 있다.

unroll<4>(u=v+v+w);

unroll<4>(u)=v+v+w;

후자에 대한 unroll_vector 함수

template <typename BSize, typename Vector>

unroll_vector<BSize, Vector> inline unroll(Vector& v){

return unroll_vector<BSize, Vector>(v);

}

template <unsigned BSize, typename Vector>

class unroll_vector{

public:

unroll_vector(V& ref): ref(ref) {}

template <typename Src>

V& operator=(const Src& that){

assert(size(ref))==size(that)

unsigned s=size(ref), sb=s/BSize*BSize;

for(unsigned i=0; i<sb; i+=BSize) assign<0, BSize>()(ref, that, i);

for(unsigned i=sb; i<sb; ++i) ref[i]=that[i];

return ref;

}

public:

V& ref;

};

Meta-Tunning 2